library(tidyverse)
library(lubridate)
library(caret)
library(nnet)
library(plotly)
library(randomForest)
options(browser = 'false')
First, we will import the raw data, clean them, and merge them according to our analysis needs. Then, we will take the monthly average to incorporate other data.
# function to clean and wrangle AIMS Moore Reef data to deal with column names and column placement
clean_MooreReef_data <- function(df) {
df[,2:ncol(df)] <- df[,1:ncol(df)]
df[,1] <- rownames(df)
df <- df %>% select(date, colnames(df)[ncol(df)])
return(df)
}
moore_reef_water_temp_2.0m <- read.csv("raw_data/temp/AIMS_MooreReef_WaterTemperature_22Oct1997to16Feb2020_2.0m.csv", skip=108, sep= ",", row.names = NULL)
moore_reef_water_temp_9.0m <- read.csv("raw_data/temp/AIMS_MooreReef_WaterTemperature_22Oct1997to17Dec2017_9.0m.csv", skip=94, sep= ",", row.names = NULL)
# run through function defined above to clean and wrangle data
moore_reef_water_temp_2.0m <- clean_MooreReef_data(moore_reef_water_temp_2.0m)
## Warning in `[<-.data.frame`(`*tmp*`, , 2:ncol(df), value = structure(list(:
## provided 6 variables to replace 5 variables
moore_reef_water_temp_9.0m <- clean_MooreReef_data(moore_reef_water_temp_9.0m)
## Warning in `[<-.data.frame`(`*tmp*`, , 2:ncol(df), value = structure(list(:
## provided 6 variables to replace 5 variables
moore_reef_water_temp_2.0m <- moore_reef_water_temp_2.0m %>%
filter(Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG != "Not available")
moore_reef_water_temp_9.0m <- moore_reef_water_temp_9.0m %>%
filter(Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG != "Not available")
# merge AIMS Moore reef temperature data
aims_temp_data <- Reduce(function(x,y) merge(x,y, by="date"), list(moore_reef_water_temp_2.0m,
moore_reef_water_temp_9.0m))
# convert water temp data from string to numeric type
aims_temp_data$Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG <-
as.numeric(as.character(aims_temp_data$Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG))
aims_temp_data$Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG <-
as.numeric(as.character(aims_temp_data$Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG))
# convert to date column into date type
aims_temp_data$date <- as.Date(aims_temp_data$date)
colnames(aims_temp_data) <- c("date", "avg_water_temp_2.0m_flat_site", "avg_water_temp_9.0m_slope_site")
# write combined AIMS Moore reef data
write.csv(aims_temp_data, "cleaned_data/aims_temperatures.csv", row.names = FALSE)
# convert date-decimal in coral cover to YYYY-MM-DD format
coral_cover <- read.csv("raw_data/trendgbr-coral-cover-with-ci.csv")
coral_cover$Date <- as.Date(format(date_decimal(coral_cover$Date), "%Y-%m-%d"))
# rename "Date" to "date"
names(coral_cover)[names(coral_cover) == "Date"] <- "date"
colnames(coral_cover) <- c("date", "mean_live_coral_cover_percent", "lower_conf_int", "upper_conf_int", "conf_int_span")
write.csv(coral_cover, "cleaned_data/coral_cover.csv", row.names = FALSE)
fish_census <- read.csv("raw_data/Fish census 1992-2015.csv", sep="\t", header=T)
fish_census <- fish_census %>% select(gbifID, class, family, genus, species, verbatimScientificName, decimalLatitude, decimalLongitude, dateIdentified)
fish_census$dateIdentified <- ymd_hms(fish_census$dateIdentified)
Group fish census by date, then count how many unique fish species were observed on a given date
fish_species_counts <- fish_census %>%
arrange(dateIdentified) %>%
group_by(dateIdentified) %>%
summarise(num_of_species=n_distinct(species))
## `summarise()` ungrouping output (override with `.groups` argument)
# rename "dateIdentified" to "date"
names(fish_species_counts)[names(fish_species_counts) == "dateIdentified"] <- "date"
fish_species_counts$date <- as.Date(fish_species_counts$date)
write.csv(fish_species_counts, "cleaned_data/fish_species_counts.csv", row.names = FALSE)
We would like to see whether we can classify sea grass species based on the survey location (latitude, longitude, and depth below sea level), and the type of sediment and seabed in which they were discovered.
We decided not to use sea grass species belonging to the Halophila genus because they are so widespread and common in tropical waters. Since the halophila genus were also present in nearly all the survey sites where other sea grass species were found, we determined that it did not make much sense from a scientific perspective, as well as from the data set we were given.
seagrass_data <- read.csv("raw_data/GBR_NESP-TWQ-3.2.1-5.4_JCU_Seagrass_1984-2018_Site-surveys.csv") %>%
# filter out for rows where we have information
filter(SEDIMENT != "Not recorded" & PRESENCE_A == "Present") %>%
# delete columns we are not going to use
select(-FID, -MONTH, -YEAR,
-SURVEY_MET, -SURVEY_NAM,
# remove seagrass belonging to halophila genus
-H_CAPRICOR, -H_TRICOSTA, -H_OVALIS, -H_UNINERVI, -H_DECIPIEN, -H_SPINULOS)
Let’s examine how many presence/absence we see for each sea grass species:
table(seagrass_data$C_ROTUNDAT)
##
## No Yes
## 30366 187
table(seagrass_data$C_SERRULAT)
##
## No Yes
## 28899 1654
table(seagrass_data$E_ACOROIDE)
##
## No Yes
## 30494 59
table(seagrass_data$S_ISOETIFO)
##
## No Yes
## 30353 200
table(seagrass_data$T_CILIATUM)
##
## No
## 30553
table(seagrass_data$T_HEMPRICH)
##
## No Yes
## 29674 879
table(seagrass_data$Z_CAPRICOR)
##
## No Yes
## 20074 10479
We see that there is actually no data at all for presence of T_CILIATUM. In addition, there are only 59 observations for E_ACOROIDE and 187 observations for C_ROTUNDAT that are actually useful in classifying these two species. Because we have a rather large data set, we want 200 or more observations for our classification model.So, we remove these columns and from our classification problem due to insufficient data.
seagrass_data <- seagrass_data %>% select(-C_ROTUNDAT, -T_CILIATUM, -E_ACOROIDE)
Since we deleted some columns, there may be some observations where all the remaining species columns have “No” values for absence. So, we filter out for rows where presence of at least one species was observed.
seagrass_data <- seagrass_data %>%
filter(C_SERRULAT=="Yes" |
S_ISOETIFO=="Yes" |
T_HEMPRICH=="Yes" |
Z_CAPRICOR=="Yes")
Now we want to count how many species were found at each location site.
count_species_present <- function(C_SERRULAT, S_ISOETIFO, T_HEMPRICH, Z_CAPRICOR) {
count = 0
if (C_SERRULAT=="Yes") {
count <- count + 1
}
if (S_ISOETIFO=="Yes") {
count <- count + 1
}
if (T_HEMPRICH=="Yes") {
count <- count + 1
}
if (Z_CAPRICOR=="Yes") {
count <- count + 1
}
return(count)
}
seagrass_data$num_species_present <- mapply(count_species_present,
seagrass_data$C_SERRULAT,
seagrass_data$S_ISOETIFO,
seagrass_data$T_HEMPRICH,
seagrass_data$Z_CAPRICOR)
How many survey sites had more than 1 species discovered?
table(seagrass_data$num_species_present)
##
## 1 2 3
## 12381 411 3
nrow(seagrass_data)
## [1] 12795
We see that only 3.2% of total survey sites recorded more than 1 species observed. For ease of building a classification model, we will remove these rows. We do not want a situation where our model cannot “decide” in classifying our observations. Because we removed only about 3% of over 12,500 observations, we still preserve statistical power.
seagrass_data <- seagrass_data %>% filter(num_species_present < 2)
table(seagrass_data$C_SERRULAT)
##
## No Yes
## 11125 1256
table(seagrass_data$S_ISOETIFO)
##
## No Yes
## 12280 101
table(seagrass_data$T_HEMPRICH)
##
## No Yes
## 11558 823
table(seagrass_data$Z_CAPRICOR)
##
## No Yes
## 2180 10201
So, after some data wrangling and exploratory analysis, we will build a model to classify 4 species of seagrass: Cymodocea Serrulata , Syringodium Isoetifolium, Thalassia Hemprichii, and Zostera Muelleri (subspecies Capricorni). Now, let’s build a SPECIES column that collects all the presence in a single variable.
# function to make a species column
get_species_type <- function(C_SERRULAT, S_ISOETIFO, T_HEMPRICH, Z_CAPRICOR) {
if (C_SERRULAT=="Yes") {
return("C_SERRULAT")
} else if (S_ISOETIFO=="Yes") {
return("S_ISOETIFO")
} else if (T_HEMPRICH =="Yes") {
return("T_HEMPRICH")
} else if (Z_CAPRICOR =="Yes") {
return("Z_CAPIRCOR")
}
}
# build species column to classify species of each observation based on presence/absence
seagrass_data$SPECIES <- mapply(get_species_type,
seagrass_data$C_SERRULAT,
seagrass_data$S_ISOETIFO,
seagrass_data$T_HEMPRICH,
seagrass_data$Z_CAPRICOR)
table(seagrass_data$SPECIES)
##
## C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
## 1256 101 823 10201
Data frame cleanup
# convert SPECIES to unordered factor
seagrass_data$SPECIES <- factor(seagrass_data$SPECIES, ordered=FALSE)
# rename misspelled column in original data set
names(seagrass_data)[names(seagrass_data) == "LATITUTDE"] <- "LATITUDE"
# only select columns relevant for our ML algorithm
seagrass_data <- seagrass_data %>%
select(SPECIES, LATITUDE, LONGITUDE, DEPTH, SEDIMENT, TIDAL)
# create negative depth for visualization purposes
seagrass_data$NEG_DEPTH <- -1 * seagrass_data$DEPTH
Write to .csv file.
write.csv(seagrass_data, "cleaned_data/seagrass_classification_data.csv", row.names = FALSE)
In this analysis, we will examine the effects of water temperature on the number of unique fish species observed in the Great Barrier Reef from 1997 to 2011.
Originally, we wanted to examine a relationship between water temperature and coral cover. However, we quickly saw that, when visualized, there does not seem to be a linear relation and we did not want to fit a mis-specified model.
Merge between temperature and coral cover data sets
temp <- read_csv("cleaned_data/aims_temperatures.csv")
## Parsed with column specification:
## cols(
## date = col_date(format = ""),
## avg_water_temp_2.0m_flat_site = col_double(),
## avg_water_temp_9.0m_slope_site = col_double()
## )
coral_cover <- read_csv("cleaned_data/coral_cover.csv")
## Parsed with column specification:
## cols(
## date = col_date(format = ""),
## mean_live_coral_cover_percent = col_double(),
## lower_conf_int = col_double(),
## upper_conf_int = col_double(),
## conf_int_span = col_double()
## )
temp_coral_cover <- merge(x=temp, y=coral_cover, by="date")
temp_coral_2m <- temp_coral_cover %>%
ggplot(aes(avg_water_temp_2.0m_flat_site, mean_live_coral_cover_percent)) +
geom_point() +
xlab("Avg water temperature at 2.0m (°C)") +
ylab("Mean coral cover percentage")
temp_coral_9m <- temp_coral_cover %>%
ggplot(aes(avg_water_temp_9.0m_slope_site, mean_live_coral_cover_percent)) +
geom_point() +
xlab("Avg water temperature at 9.0m (°C)") +
ylab("Mean coral cover percentage")
# display and save plots
temp_coral_2m
ggsave("viz/temp_coral_2m.png", plot=temp_coral_2m)
## Saving 7 x 5 in image
temp_coral_9m
ggsave("viz/temp_coral_9m.png", plot=temp_coral_9m)
## Saving 7 x 5 in image
As we can see, there is no linear relationship. While there are 2 or 3 clusters of coral cover percentage values, they seem to be pretty consistent across the range of water temperatures at 2.0m and 9.0 below sea level.
We also suspect that increase in water temperature could also affect diversity in sea life. So, we will examine the relationship between water temperature and the number of unique species of fish observed in the Great Barrier Ree. First, we can do some basic visualization by plotting the relationship between water temperatures at depth 2.0m and 9.0m with the number of unique fish species observedin the Great Barrier Reef.
fish <- read_csv("cleaned_data/fish_species_counts.csv")
## Parsed with column specification:
## cols(
## date = col_date(format = ""),
## num_of_species = col_double()
## )
fish_temp <- merge(x=temp, y=fish, by="date")
# water temp at 2.0m
fish_temp_2m <- fish_temp %>% ggplot(aes(avg_water_temp_2.0m_flat_site, num_of_species)) +
geom_point() +
xlab("Avg water temperature at 2.0m (°C)") +
ylab("Num. of unique fish species")
# water temp at92.0m
fish_temp_9m <- fish_temp %>% ggplot(aes(avg_water_temp_9.0m_slope_site, num_of_species)) +
geom_point() +
xlab("Avg water temperature at 9.0m (°C)") +
ylab("Num. of unique fish species")
# display and save plots
fish_temp_2m
ggsave("viz/fish_temp_2m.png", plot=fish_temp_2m)
## Saving 7 x 5 in image
fish_temp_9m
ggsave("viz/fish_temp_9m.png", plot=fish_temp_9m)
## Saving 7 x 5 in image
There seems to be a bit of negative linear relationship going on, so we will fit a linear model examining the number of unique species discovered in relation to rising temperature.
Now, we can split our data set into train and test sets, using 0.6 to partition our data. Our outcome is the mean coral cover percentage and water temperature is our covariate. We will fit 2 linear regression models: one examining effect of water temperature at 2.0m and the other examining the effect of temperature at 9.0m.
train_index <- createDataPartition(y=fish_temp$num_of_species, times=1, p = 0.6, list=FALSE)
train_set <- fish_temp[train_index, ]
test_set <- fish_temp[-train_index, ]
Fit linear regression model:
fish_temp_2.0m <- lm(num_of_species ~ avg_water_temp_2.0m_flat_site, data=train_set)
summary(fish_temp_2.0m)
##
## Call:
## lm(formula = num_of_species ~ avg_water_temp_2.0m_flat_site,
## data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.736 -9.756 0.550 9.425 38.614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 130.2206 17.3808 7.492 2.05e-12 ***
## avg_water_temp_2.0m_flat_site -2.4675 0.6342 -3.891 0.000135 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.2 on 203 degrees of freedom
## Multiple R-squared: 0.0694, Adjusted R-squared: 0.06481
## F-statistic: 15.14 on 1 and 203 DF, p-value: 0.0001354
fish_temp_9.0m <- lm(num_of_species ~ avg_water_temp_9.0m_slope_site, data=train_set)
summary(fish_temp_9.0m)
##
## Call:
## lm(formula = num_of_species ~ avg_water_temp_9.0m_slope_site,
## data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.851 -9.798 0.328 9.725 39.067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 133.942 17.657 7.586 1.17e-12 ***
## avg_water_temp_9.0m_slope_site -2.614 0.647 -4.041 7.56e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.16 on 203 degrees of freedom
## Multiple R-squared: 0.07445, Adjusted R-squared: 0.06989
## F-statistic: 16.33 on 1 and 203 DF, p-value: 7.556e-05
We see that the models are very similar in results. The coefficient with covariate 2.0m water temperature and 9.0 water temperature is -2.56 and -2.61, respectively.
Although we should not expect a major difference between how each of the two models performs, let’s compare them anyways to assess which water temperature depth is a better predictor of unique fish species observed in the Great Barrier Reef.
pred_2.0m <- predict(fish_temp_2.0m, test_set)
pred_9.0m <- predict(fish_temp_9.0m, test_set)
postResample(pred = pred_2.0m, obs = test_set$num_of_species)
## RMSE Rsquared MAE
## 15.15014383 0.08097776 11.25997201
postResample(pred = pred_9.0m, obs = test_set$num_of_species)
## RMSE Rsquared MAE
## 15.15118836 0.08046278 11.24961274
As expected, results are very similar.
We can assess this visually to confirm our results.
# water temp at 2.0m
fitted_fish_2m <- test_set %>%
ggplot(aes(avg_water_temp_2.0m_flat_site, num_of_species)) +
geom_point() +
geom_abline(intercept=fish_temp_2.0m$coefficients[1], slope=fish_temp_2.0m$coefficients[2], col="red") +
xlab("Avg water temperature at 2.0m (°C)") +
ylab("Num. of unique fish species")
# water temp at 9.0m
fitted_fish_9m <- test_set %>%
ggplot(aes(avg_water_temp_9.0m_slope_site, num_of_species)) +
geom_point() +
geom_abline(intercept=fish_temp_9.0m$coefficients[1], slope=fish_temp_9.0m$coefficients[2], col="blue") +
xlab("Avg water temperature at 9.0m (°C)") +
ylab("Num. of unique fish species")
# display and save plots
fitted_fish_2m
ggsave("viz/fitted_fish_2m.png", plot=fitted_fish_2m)
## Saving 7 x 5 in image
fitted_fish_9m
ggsave("viz/fitted_fish_9m.png", plot=fitted_fish_9m)
## Saving 7 x 5 in image
They both perform very similarly, and choosing either water temperature as our predictor will yield similar results.
Continuing on from sea life diversity, we have data on presence or absence of certain seagrass species in the Great Barrier Reef from 1999 to 2003. Let’s try to build a classifier to determine how location, and types of sediment and seabed may predict presence of certain sea grass.
As written in the data wrangling and cleaning RMarkdown/HTML file, we chose 4 species to classify: Cymodocea Serrulata, Syringodium Isoetifolium, Thalassia Hemprichii, and Zostera Muelleri (subspecies Capricorni).
seagrass <- read.csv("cleaned_data/seagrass_classification_data.csv", as.is =TRUE)
seagrass$SPECIES <- as.factor(seagrass$SPECIES)
seagrass$SEDIMENT <- as.factor(seagrass$SEDIMENT)
seagrass$TIDAL <- as.factor(seagrass$TIDAL)
head(seagrass)
## SPECIES LATITUDE LONGITUDE DEPTH SEDIMENT TIDAL NEG_DEPTH
## 1 Z_CAPIRCOR -23.65857 151.1206 0 Mud intertidal 0
## 2 Z_CAPIRCOR -23.65840 151.1212 0 Mud intertidal 0
## 3 Z_CAPIRCOR -23.65898 151.1203 0 Mud intertidal 0
## 4 Z_CAPIRCOR -23.65970 151.1197 0 Mud intertidal 0
## 5 Z_CAPIRCOR -23.65884 151.1205 0 Mud intertidal 0
## 6 Z_CAPIRCOR -23.65993 151.1197 0 Mud intertidal 0
Here are the summary statistics of the sea grass data we cleaned and wrangled.
summary(seagrass)
## SPECIES LATITUDE LONGITUDE DEPTH
## C_SERRULAT: 1256 Min. :-24.20 Min. :142.5 Min. : 0.0000
## S_ISOETIFO: 101 1st Qu.:-23.79 1st Qu.:146.8 1st Qu.: 0.0000
## T_HEMPRICH: 823 Median :-22.41 Median :150.5 Median : 0.0000
## Z_CAPIRCOR:10201 Mean :-21.06 Mean :149.0 Mean : 0.9253
## 3rd Qu.:-19.17 3rd Qu.:151.3 3rd Qu.: 1.2279
## Max. :-10.75 Max. :151.9 Max. :29.3583
##
## SEDIMENT TIDAL NEG_DEPTH
## Gravel: 1 intertidal:7433 Min. :-29.3583
## Mud :8969 subtidal :4948 1st Qu.: -1.2279
## Reef : 63 Median : 0.0000
## Rock : 66 Mean : -0.9253
## Rubble: 107 3rd Qu.: 0.0000
## Sand :3151 Max. : 0.0000
## Shell : 24
First we plot sea grass according to location (latitude and longitude). Then we will add a third axis (depth) to visualize this in 3-dimensions using plotly package. Since depth is measured in meters below sea level, we visualize this in negative values.
source("config.R")
Sys.setenv("plotly_username"=username)
Sys.setenv("plotly_api_key"=api_key)
seagrass_data_viz_2d <- seagrass %>% ggplot() +
geom_point(aes(x=LATITUDE, y=LONGITUDE, color=SPECIES)) +
ggtitle("Seagrass present in the Great Barrier Reef, 1999 - 2003")
seagrass_data_viz_2d
ggsave("viz/seagrass_data_viz_2d.png", plot=seagrass_data_viz_2d)
## Saving 7 x 5 in image
plotly_3d <- plot_ly(seagrass, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers") %>%
layout(title = "Seagrass present in the Great Barrier Reef, 1999 - 2003")
#api_create(plotly_3d, filename = "seagrass_exp_data_viz")
plotly_3d
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
We can also see how our categorical predictors relate to sea grass species.
seagrass_sediment <- seagrass %>%
ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")
seagrass_tidal <- seagrass %>%
ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")
seagrass_sediment
ggsave("viz/seagrass_sediment.png", plot=seagrass_sediment)
## Saving 7 x 5 in image
seagrass_tidal
ggsave("viz/seagrass_tidal.png", plot=seagrass_tidal)
## Saving 7 x 5 in image
We will first use random forest to build a classifier and then use a multinomial logistic regression model, and compare the two.
Let’s first partition our data set into train and test sets. Since we have a lot more data here than in the linear regression model, we will partition it by 75%-25%.
# test-train split
seagrass_train_ind <- createDataPartition(y = seagrass$SPECIES, p=0.75, list=FALSE)
train_set <- seagrass[seagrass_train_ind, ]
test_set <- seagrass[-seagrass_train_ind, ]
rf_fit <- randomForest(SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT + TIDAL,
data=train_set,
mtry = 2)
rf_fit
##
## Call:
## randomForest(formula = SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT + TIDAL, data = train_set, mtry = 2)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.78%
## Confusion matrix:
## C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR class.error
## C_SERRULAT 702 21 13 206 0.25477707
## S_ISOETIFO 32 25 4 15 0.67105263
## T_HEMPRICH 48 1 562 7 0.09061489
## Z_CAPIRCOR 88 4 5 7554 0.01267808
rf_pred <- predict(rf_fit, newdata = test_set, type = "response")
confusionMatrix(table(pred = rf_pred, true = test_set$SPECIES))
## Confusion Matrix and Statistics
##
## true
## pred C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
## C_SERRULAT 246 7 14 24
## S_ISOETIFO 6 13 2 1
## T_HEMPRICH 1 0 179 0
## Z_CAPIRCOR 61 5 10 2525
##
## Overall Statistics
##
## Accuracy : 0.9577
## 95% CI : (0.95, 0.9645)
## No Information Rate : 0.8242
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8558
##
## Mcnemar's Test P-Value : 1.744e-07
##
## Statistics by Class:
##
## Class: C_SERRULAT Class: S_ISOETIFO Class: T_HEMPRICH
## Sensitivity 0.78344 0.520000 0.87317
## Specificity 0.98381 0.997067 0.99965
## Pos Pred Value 0.84536 0.590909 0.99444
## Neg Pred Value 0.97574 0.996094 0.99108
## Prevalence 0.10149 0.008080 0.06626
## Detection Rate 0.07951 0.004202 0.05785
## Detection Prevalence 0.09405 0.007111 0.05818
## Balanced Accuracy 0.88363 0.758534 0.93641
## Class: Z_CAPIRCOR
## Sensitivity 0.9902
## Specificity 0.8603
## Pos Pred Value 0.9708
## Neg Pred Value 0.9493
## Prevalence 0.8242
## Detection Rate 0.8161
## Detection Prevalence 0.8407
## Balanced Accuracy 0.9252
We see that our classification model works quite well, especially for T_HEMPRICH and Z_CAPRICOR, which have 85%+ sensitivity and specificity. However, we got quite a low sensitivity for S_ISOETIFO. Recall to our data wrangling portion that S_ISOETIFO had only about 100 “Yes” observations. Since we had a small sample size for S_ISOETIFO relative to the other 3 seagrass species, this may have contributed to the low sensitivity.
We can visually assess our predicted values with true values of species to see how our model performed.
# true values
true_class <- plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers") %>%
layout(title = "True seagrass classification in test set")
true_class
#api_create(true_class, filename = "seagrass_true_classification")
# predicted values
predicted_class <- plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~rf_pred, type="scatter3d", mode="markers") %>%
layout(title = "Predicted seagrass classification in test set")
predicted_class
#api_create(predicted_class, filename = "seagrass_pred_classification")
For sediment type:
true_rf_sediment <- test_set %>%
ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge") + labs(fill="True species")
pred_rf_sediment <- test_set %>%
ggplot(aes(SEDIMENT, fill=rf_pred)) + geom_bar(width=.5, position = "dodge") + labs(fill="Predicted species")
true_rf_sediment
ggsave("viz/true_rf_sediment.png", plot=true_rf_sediment)
## Saving 7 x 5 in image
pred_rf_sediment
ggsave("viz/pred_rf_sediment.png", plot=pred_rf_sediment)
## Saving 7 x 5 in image
For seabed type:
true_rf_tidal <- test_set %>%
ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge") + labs(fill="True species")
pred_rf_tidal <- test_set %>%
ggplot(aes(TIDAL, fill=rf_pred)) + geom_bar(width=.5, position = "dodge") + labs(fill="Predicted species")
true_rf_tidal
ggsave("viz/true_rf_tidal.png", plot=true_rf_tidal)
## Saving 7 x 5 in image
pred_rf_tidal
ggsave("viz/pred_rf_tidal.png", plot=pred_rf_tidal)
## Saving 7 x 5 in image
Let’s examine variable importance.
variable_importance <- importance(rf_fit)
tmp <- data_frame(feature = rownames(variable_importance),
Gini = variable_importance[,1]) %>%
arrange(desc(Gini))
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
tmp
## # A tibble: 5 x 2
## feature Gini
## <chr> <dbl>
## 1 LATITUDE 998.
## 2 LONGITUDE 815.
## 3 DEPTH 609.
## 4 TIDAL 129.
## 5 SEDIMENT 75.6
We see that longitude and latitude were very predictive of presence of seagrass followed by depth from sea level. The types of sediment and seabed (intertidal or subtidal seabed) are not very good predictors. Thus, it seems that the location of where the sea grass was discovered matters more than the various ocean floor properties.
tmp %>% ggplot(aes(x=reorder(feature, Gini), y=Gini)) +
geom_bar(stat='identity') +
coord_flip() + xlab("Feature") +
theme(axis.text=element_text(size=8))
We can now try a multinomial logistic regression model to see how it compares to random forest. We will use the nnet package.
The logistic regression model is as follows:
multinom_fit <- multinom(SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT, data=train_set)
## # weights: 44 (30 variable)
## initial value 12874.515732
## iter 10 value 3270.716174
## iter 20 value 2601.142207
## iter 30 value 2489.683000
## iter 40 value 2433.518856
## iter 50 value 2416.481441
## iter 60 value 2385.203109
## iter 70 value 2383.328069
## iter 80 value 2383.236112
## iter 90 value 2382.659333
## iter 100 value 2382.411013
## final value 2382.411013
## stopped after 100 iterations
summary(multinom_fit)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## multinom(formula = SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT,
## data = train_set)
##
## Coefficients:
## (Intercept) LATITUDE LONGITUDE DEPTH SEDIMENTMud SEDIMENTReef
## S_ISOETIFO -382.40287 2.0666240 2.6715999 0.02457381 24.940188 25.83904
## T_HEMPRICH -52.58375 1.5257042 0.4809432 -0.34456900 6.755413 10.08393
## Z_CAPIRCOR -291.21628 0.6769761 1.4751511 -0.77693752 89.918927 26.39204
## SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 26.21524 -24.49808 25.602863 26.55965
## T_HEMPRICH 10.90509 11.41330 9.928641 10.04138
## Z_CAPIRCOR 87.31451 85.69719 89.064339 88.54503
##
## Std. Errors:
## (Intercept) LATITUDE LONGITUDE DEPTH SEDIMENTMud
## S_ISOETIFO 0.0024742777 0.05772111 0.007798700 0.02456075 0.3746218
## T_HEMPRICH 0.0004550318 0.06191391 0.007840122 0.04190867 0.2940471
## Z_CAPIRCOR 0.0020779419 0.02873447 0.004627660 0.02901677 0.3547277
## SEDIMENTReef SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 6.199484e-01 0.7618304 NaN 0.3504961 0.9465096
## T_HEMPRICH 4.775093e-01 0.6031441 0.4402797 0.2603755 0.8929737
## Z_CAPIRCOR 5.077246e-26 0.6825874 1.0330768 0.3562248 0.8203911
##
## Residual Deviance: 4764.822
## AIC: 4824.822
Relative risk ratios where reference group is C_SERRULAT
exp(coef(multinom_fit))
## (Intercept) LATITUDE LONGITUDE DEPTH SEDIMENTMud SEDIMENTReef
## S_ISOETIFO 8.405111e-167 7.898114 14.463091 1.0248782 6.782440e+10 1.666292e+11
## T_HEMPRICH 1.456022e-23 4.598381 1.617599 0.7085257 8.586942e+02 2.395484e+04
## Z_CAPIRCOR 3.360287e-127 1.967918 4.371696 0.4598120 1.125366e+39 2.896801e+11
## SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 2.427350e+11 2.294145e-11 1.315777e+11 3.425384e+11
## T_HEMPRICH 5.445261e+04 9.051767e+04 2.050945e+04 2.295715e+04
## Z_CAPIRCOR 8.321628e+37 1.651272e+37 4.787965e+38 2.848501e+38
# predicted probabilities
predicted_prob <- predict(multinom_fit, newdata=test_set, type="probs")
# predicted classes
predicted_class <- predict(multinom_fit, newdata=test_set, type="class")
confusionMatrix(data = predicted_class, reference = test_set$SPECIES )
## Confusion Matrix and Statistics
##
## Reference
## Prediction C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
## C_SERRULAT 134 8 9 24
## S_ISOETIFO 1 0 0 2
## T_HEMPRICH 18 3 174 18
## Z_CAPIRCOR 161 14 22 2506
##
## Overall Statistics
##
## Accuracy : 0.9095
## 95% CI : (0.8988, 0.9194)
## No Information Rate : 0.8242
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6644
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: C_SERRULAT Class: S_ISOETIFO Class: T_HEMPRICH
## Sensitivity 0.42675 0.0000000 0.84878
## Specificity 0.98525 0.9990225 0.98650
## Pos Pred Value 0.76571 0.0000000 0.81690
## Neg Pred Value 0.93834 0.9919120 0.98924
## Prevalence 0.10149 0.0080802 0.06626
## Detection Rate 0.04331 0.0000000 0.05624
## Detection Prevalence 0.05656 0.0009696 0.06884
## Balanced Accuracy 0.70600 0.4995112 0.91764
## Class: Z_CAPIRCOR
## Sensitivity 0.9827
## Specificity 0.6379
## Pos Pred Value 0.9271
## Neg Pred Value 0.8875
## Prevalence 0.8242
## Detection Rate 0.8100
## Detection Prevalence 0.8736
## Balanced Accuracy 0.8103
We see that our multinomial logistic model has about 91% overall accuracy, which performs a bit worse than random forest. However, the model performs very badly in predicting for S_ISOETIFO.
The model seems to predict T_HEMPRICH the best with 88.7% sensitivity and 98.8% specificity. The model also does not perform well for sensitivity of C_SERRULAT, with only about 41.7% sensitivity.
So, we see that the overall accuracy for multinomial logistic regression model vs random forest model was 95.6% and 90.9%, respectively. However, the overall accuracy stated for the logistic regression model is deceiving since it did not perform well in sensitivity in 2 out of 4 classes.